library(tidyverse)
library(sf)
library(raster)# Load the merged Satellite image for Kazanlak Valley, Bulgaria
# use cds-spatial repo from github https://github.com/CDS-AU-DK/cds-spatial
kaz <-brick("../1_Teaching/cds-spatial/data/Kaz.tif")
plotRGB(kaz, stretch= "lin")crs(kaz)## CRS arguments:
## +proj=utm +zone=35 +datum=WGS84 +units=m +no_defs
# Load prediction data as points (left bottom corner of the evaluated cell)
cnne_df <- read_csv("2021-10-25.predictions/results/east/east.csv")
# 15334 points (origins in the raster cells)
cnnw_df <- read_csv("2021-10-25.predictions/results/west/west.csv")
# 15334 points (origins in the raster cells)
cnn_df <- rbind(cnne_df, cnnw_df)
# 30504 rows
# Check the probabilities of there being a mound in a given cell
hist(cnn_df$mound_probability,
main = "Probability of mound present in a satellite image gridcell");abline(v= 0.8, col="red", lty = 2, lwd = 3)table(cut(cnn_df$mound_probability, c(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1)))##
## (0,0.1] (0.1,0.2] (0.2,0.3] (0.3,0.4] (0.4,0.5] (0.5,0.6] (0.6,0.7] (0.7,0.8]
## 7865 7710 6983 4757 2243 653 195 58
## (0.8,0.9] (0.9,1]
## 30 10
# ok, so there is an exponential drop-off. Let’s make the predictions to a grid so we can assess whether gridcells with higher probabilities of mound presence in fact contain burial mounds. To keep the grid lightweight, let’s start with those gridcells that have values of mound probability at 80% and higher.
### Build a grid of those with 80%+ probability of containing a mound
# Build a grid from all points
#cnnall_sp <- st_as_sf(cnn_df, coords = c("x","y"), crs = 32635)
#cnnall_grid <- st_make_grid(cnnall_sp, cellsize = 250, what = "polygons")
# Filter predictions to those that have 80+% likelihood of containing a mound
cnn80_sp <- cnn_df %>%
filter(mound_probability >0.799) %>% #333 observations
st_as_sf(coords = c("x","y"), crs = 32635)
# Make a grid of 80%+ cells, 382 cells, 332 unique ones
cnn_grid80 <- st_make_grid(cnn80_sp, cellsize = 250, what = "polygons")
#plot(cnn_grid80)
# Add probability data to the 80%+ grid
#cnnall_datagrid <- st_join(st_sf(cnnall_grid), cnnall_sp)
cnn_grid80 <- st_join(st_sf(cnn_grid80), cnn80_sp)
# Visualize the grid cells with higher probability
ggplot(cnn_grid80) +
geom_sf(aes(color = mound_probability))# 333 raster cells are predicted to contain mounds with greater
# than 80% likelihood. Archaeologists found 773 mounds in fraction of the area
# View the grid
plotRGB(kaz, stretch= "lin");
plot(cnn_grid80$geometry, add = TRUE, border = "white")that we will need later for validation
# Bring in all the mounds observed in the field
mounds <- st_read("../1_Teaching/cds-spatial/data/KAZ_mounds.shp")## Reading layer `KAZ_mounds' from data source `C:\Users\Adela\Documents\RStudio\1_Teaching\cds-spatial\data\KAZ_mounds.shp' using driver `ESRI Shapefile'
## Simple feature collection with 773 features and 5 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 352481.3 ymin: 4712325 xmax: 371282.4 ymax: 4730029
## projected CRS: WGS 84 / UTM zone 35N
plot(mounds$geometry, main = "Mounds found through survey")# Bring in survey area to see overall coverage
survey <- st_read("../1_Teaching/cds-spatial/data/KAZ_surveyarea.shp")## Reading layer `KAZ_surveyarea' from data source `C:\Users\Adela\Documents\RStudio\1_Teaching\cds-spatial\data\KAZ_surveyarea.shp' using driver `ESRI Shapefile'
## Simple feature collection with 98 features and 5 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 352181.7 ymin: 4712923 xmax: 368890.2 ymax: 4733359
## projected CRS: WGS 84 / UTM zone 35N
plot(survey$geometry, main = "Area covered by survey")# Extrapolate rough study area after subtracting outliers
far <- mounds %>%
st_is_within_distance(survey, 1500) %>%
lengths>0
farmounds <- which(far==0) # these are the mounds that are mostly far from survey area
# convex hull of points without outliers
survey_sm <- st_convex_hull(st_union(mounds$geometry[-farmounds]))
# convex hull of survey polygons
survey_ch <- st_convex_hull(st_union(survey$geometry))
# quickly see the difference in bounding boxes
plot(survey_ch, col = "green", main = "Convex hull around survey area (tight and extensive)");plot(survey_sm, col = "red", alpha=0.5, add = TRUE); plot(survey$geometry, add =TRUE)# View all mounds
plotRGB(kaz, stretch= "lin");
plot(survey_ch, col = "green", add = TRUE);
plot(survey_sm, col = "red", add = TRUE);
plot(survey$geometry, col = "lightyellow", add = TRUE );
plot(mounds$geometry[-farmounds,], add = TRUE, col = "pink");
plot(cnn_grid80$geometry, add = TRUE, border = "white");
legend("bottom", legend = "Grid cells with 80%+ probability of mounds (white squares) overlayed on \nthe satellite image and survey study area outlines with mounds (pink circles)")The Tundzha Regional Archaeological project surveyed 85 sq km of the area included in the satellite imagery in 2009-2011, documenting the burial mounds within which create a dataset suitable for validation.
Area surveyed contains 773 documented mounds of various shapes and sizes. This area also contains 192 of 332 grid cells of 250m a side which were allocated 0.6+ probability of containing a mound. Let’s explore how many mounds are outside these grid cells and which mounds were predicted and which not. Bring in mound attributes and check correlation. Does size play a role?
# Bring in mound attributes (n = 773) to aid exploration
mounddata <- read_csv("../1_Teaching/cds-spatial/data/KAZ_mdata.csv")
mounds <- mounds %>%
left_join(mounddata, by = c("TRAP_Code"="MoundID"))Let’s see which of the grid cells with high probability of containing a mound are in the TRAP study area?
# Select gridcells that fall within TRAP study area
survey_grids80 <- st_intersection(survey_ch, cnn_grid80) # 40 grid cells have 80%+ likelihood of containing mound inside the study area
#plot(survey_grids80)Archaeological survey documented 773 mounds in the study area. Mounds from this dataset that fall in the 80%+ probability gridcells form the true positives and comprise the success of the CNN model.
# Check spatial overlap between 80%+ grid cells and all 773 mounds
predicted80 <- st_intersection(mounds, cnn_grid80)# %>% distinct()
# Which ones are not-predicted?
`%nin%` = Negate(`%in%`)
unpredicted80 <- mounds[mounds$TRAP_Code%nin%predicted80$TRAP_Code,] # 627 not predicted
paste0("there are ", length(unique(predicted80$TRAP_Code)), " unique mounds detected within the ",length(unique(cnn_grid80$X1)), " unique gridcells.")## [1] "there are 25 unique mounds detected within the 40 unique gridcells."
head(predicted80,2) # predicted feature snippet## Simple feature collection with 2 features and 11 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 360896.8 ymin: 4713055 xmax: 360896.8 ymax: 4713055
## projected CRS: WGS 84 / UTM zone 35N
## Notes TRAP_Code Cluster Lat Long Condition Robbed Height LandUse
## 82 Mound 4066 3 42.55737 25.30552 3 1 5 Pasture
## 82.1 Mound 4066 3 42.55737 25.30552 3 1 5 Pasture
## X1 mound_probability geometry
## 82 4748 0.8855298 POINT (360896.8 4713055)
## 82.1 4749 0.8916270 POINT (360896.8 4713055)
grids_n_mounds <- predicted80 %>%
# st_drop_geometry() %>%
group_by(X1) %>%
count()
length(unique(grids_n_mounds$X1)) # all the detected mounds are withing 51 grids, out of which one is labelled NA, so 50 it is## [1] 11
hist(grids_n_mounds$n, breaks = 40,
xlab = "Mound count in grid",
ylab = "Grid count",
main = paste0("Distribution of predicted mounds (n = ",length(unique(predicted80$TRAP_Code)), ") \n across ",nrow(grids_n_mounds)," TP grid cells among a total of ",length(unique(cnn_grid80$X1))," with 80% likelihood")) Update this: # Summary: 11 out of 40 grid cells (0.275) in the study area with 80%+ probability contain between 1 and 29 detected mounds. The high number makes sense because of the dense necropolis of little 10m diameter mounds in the NW of the valley.
The previous chunk shows that 25 of 773 unique mounds appear in 11 of the 40 (25%) “survey” grids that have 80% or higher probability of mounds and are at least partially within the TRAP study area. The intersection actually shows 34 intersecting mound points, but some fall on the border of two gridcells and are double-counted. The entire satellite image contains 40 grids with 80%+ probability of mounds, 40 fall in the TRAP study area and 11 of these contain 25 mounds (0.0323415% of total).
# Add the information on prediction to mound dataset
mounds <- mounds %>%
mutate(predicted80 =
case_when(TRAP_Code %in% predicted80$TRAP_Code ~ "yes", TRAP_Code %nin% predicted80$TRAP_Code ~ "no"))
# Look at the predicted and unpredicted mounds
plot(unpredicted80$geometry, col = "red", main = "Mounds predicted (green) and unpredicted (red) by CNN");plot(predicted80$geometry, col = "darkgreen", add= TRUE); Let’s look at whether mound Height is a factor impacting detectability. (it is a factor for visual inspection as higher mounds are more visible in sat img.)
# filter the mounds for largesh and noticeable ones (2+ meters)
large <- mounds %>%
filter(Height>=2) # 247 obs
mounds %>%
group_by(predicted80) %>%
summarize(min = min(Height), max = max(Height))## Simple feature collection with 2 features and 3 fields
## geometry type: MULTIPOINT
## dimension: XY
## bbox: xmin: 352481.3 ymin: 4712325 xmax: 371282.4 ymax: 4730029
## projected CRS: WGS 84 / UTM zone 35N
## # A tibble: 2 x 4
## predicted80 min max geometry
## <chr> <dbl> <dbl> <MULTIPOINT [m]>
## 1 no 0 20 ((352481.3 4724776), (352488.8 4724747), (352489.3 47~
## 2 yes 0.1 7 ((353488.9 4725971), (353521.2 4725932), (353665.3 47~
# Check the height difference between un- and predicted mounds
boxplot(Height~predicted80, data = mounds, xlab = "Was mound detected?")# Check height distribution btw un- and predicted mounds
hist(mounds$Height[mounds$predicted80 == "no"], col = "pink", breaks = 20,main = "Histogram of detected (yellow) and undetected (pink) mounds by height",
xlab = "Height (m)");
hist(mounds$Height[mounds$predicted80 == "yes"], col = "yellow", add =TRUE, alpha = 0.5);# Is there a significant relationship between the Height of a mound and its detection?
t.test(Height ~ as.factor(predicted80), data = mounds)##
## Welch Two Sample t-test
##
## data: Height by as.factor(predicted80)
## t = -0.60744, df = 25.8, p-value = 0.5489
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
## -1.1695773 0.6361549
## sample estimates:
## mean in group no mean in group yes
## 1.733289 2.000000
str(t.test(Height ~ as.factor(predicted80), data = mounds))## List of 10
## $ statistic : Named num -0.607
## ..- attr(*, "names")= chr "t"
## $ parameter : Named num 25.8
## ..- attr(*, "names")= chr "df"
## $ p.value : num 0.549
## $ conf.int : num [1:2] -1.17 0.636
## ..- attr(*, "conf.level")= num 0.95
## $ estimate : Named num [1:2] 1.73 2
## ..- attr(*, "names")= chr [1:2] "mean in group no" "mean in group yes"
## $ null.value : Named num 0
## ..- attr(*, "names")= chr "difference in means between group no and group yes"
## $ stderr : num 0.439
## $ alternative: chr "two.sided"
## $ method : chr "Welch Two Sample t-test"
## $ data.name : chr "Height by as.factor(predicted80)"
## - attr(*, "class")= chr "htest"
# ... no, as the p at 0.5 is not significantGiven 25 mounds were predicted by CNN, where are the remaining 748? Let’s explore where the missing mounds are and what probability has been assigned to the cells that contain them by CNN model.
# Create a grid of ALL the predictions to see where the remaining 627 unpredicted mounds are
cnnall_sp <- st_as_sf(cnn_df, coords = c("x","y"), crs = 32635)
#plot(cnnall_sp)
cnnall_grid <- st_make_grid(cnnall_sp, cellsize = 250, what = "polygons")
#plot(cnnall_grid) # we can see that the cells overlap in the middle
cnnall_grid <- st_join(st_sf(cnnall_grid), cnnall_sp) # add attributes
# Trying to eliminate overlap in gridcells
# eliminating id duplicates is nonsensical as the two datasets have same ids, I must aggregate spatially, e.g. st_union or st_difference, or relabel the datasets before merging, or crop to a shared non-overlapping border.
missing80 <- st_intersection(unpredicted80, cnnall_grid) # 2158, clearly duplicates
# Check for duplicates
?distinct()
length(duplicated(missing80$TRAP_Code)) # 2158 points to duplication## [1] 2606
missing80 <- missing80[!duplicated(missing80$TRAP_Code),] #620 undetected moundsHow many grids are the 741-8 missing mounds located within?
# How many grids are the missing mounds in?
missing80 %>%
st_drop_geometry() %>%
group_by(X1) %>%
tally() %>% arrange(desc(n))## # A tibble: 279 x 2
## X1 n
## <dbl> <int>
## 1 7551 28
## 2 7921 27
## 3 9209 15
## 4 7366 13
## 5 8844 13
## 6 8292 12
## 7 8294 12
## 8 9766 12
## 9 7367 11
## 10 7552 11
## # ... with 269 more rows
# The missing 740 mounds are contained within 279 grid cells of the satellite image
# Let's see the distribution of missing mounds per grid cell
missing80 %>%
group_by(X1) %>%
count() %>%
ggplot()+ geom_histogram(aes(n))+ggtitle("Count of undetected mounds (748) per gridcell(n=259)") + theme_bw()# some gridcells contain up to 28 undetected mounds. No surprise with the NW necropolisUndetected mounds appear in 259 gridcells, which contain between 1 and 29 mounds, heavily skewed to the left.
What is the probability of the cells that contain undetected mounds?
# What is the mound_probability in these gridcells?
hist(missing80$mound_probability,
xlab = "Cnn-generated probability of mound",
main = "Probability assigned to gridcells containing undetected mounds")Somehow 5 of the undetected mounds still have high mound-probability!? How have these snuck through? Maybe they are outside the study area?
Probability in five undetected mounds is showing to be over 0.59 when it should be below! Let’s visually inspect what is going on and investigate:
# Ggplot of mounds with high prob and grid with high prob
# missing %>%
# filter(mound_probability>0.59) %>%
# ggplot()+
# geom_sf(aes(color = mound_probability))+
# geom_sf(data = cnn_grid80, alpha = 0.5)
library(mapview)
missing80 %>%
filter(mound_probability>0.59) %>%
mapview(color = "red")+
# mapview(cnn_grid80, alpha = 0.5)+
mapview(cnnall_grid, zcol= "mound_probability")